# Ensure requisite package downloads occur in task directory# This is necessary since oligo attempts to install an annotations package when loading raw files.# This prevent permissions issues for installing and prevents side effects of processing a given dataset. # (i.e. changes to more permanent package installations).libPaths(c(getwd(), .libPaths()))library(dplyr)# Ensure infix operator is available, methods should still reference dplyr namespace otherwiseoptions(dplyr.summarise.inform =FALSE)# Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.'if(is.null(params$runsheet)){stop("PARAMETERIZATION ERROR: Must supply runsheet path")}runsheet=params$runsheet# <path/to/runsheet>message(params)## Set up output structure# Output ConstantsDIR_RAW_DATA<-"00-RawData"DIR_NORMALIZED_EXPRESSION<-"01-oligo_NormExp"DIR_DGE<-"02-limma_DGE"dir.create(DIR_RAW_DATA)dir.create(DIR_NORMALIZED_EXPRESSION)dir.create(DIR_DGE)## Save original par settings## Par may be temporarily changed for plotting purposes and reset once the plotting is doneoriginal_par<-par()options(preferRaster=TRUE)# use Raster when possible to avoid antialiasing artifacts in imagesoptions(timeout=1000)
# Utility function to improve robustness of function calls# Used to remedy intermittent internet issues during runtime retry_with_delay<-function(func, ...){max_attempts=5initial_delay=10delay_increase=30attempt<-1current_delay<-initial_delaywhile(attempt<=max_attempts){result<-tryCatch( expr =func(...), error =function(e)e)if(!inherits(result, "error")){return(result)}else{if(attempt<max_attempts){message(paste("Retry attempt", attempt, "failed for function with name <", deparse(substitute(func)) ,">. Retrying in", current_delay, "second(s)..."))Sys.sleep(current_delay)current_delay<-current_delay+delay_increase}else{stop(paste("Max retry attempts reached. Last error:", result$message))}}attempt<-attempt+1}}df_rs<-read.csv(runsheet, check.names =FALSE)%>%dplyr::mutate_all(function(x)iconv(x, "latin1", "ASCII", sub=""))# Convert all characters to ascii, when not possible, remove the character## Determines the organism specific annotation file to use based on the organism in the runsheetfetch_organism_specific_annotation_file_path<-function(organism){# Uses the GeneLab GL-DPPD-7110_annotations.csv file to find the organism specific annotation file path# Raises an exception if the organism does not have an associated annotation file yetall_organism_table<-read.csv("https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv")annotation_file_path<-all_organism_table%>%dplyr::filter(species==organism)%>%dplyr::pull(genelab_annots_link)# Guard clause: Ensure annotation_file_path populated# Else: raise exception for unsupported organismif(length(annotation_file_path)==0){stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv. Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row"))}return(annotation_file_path)}annotation_file_path<-retry_with_delay(fetch_organism_specific_annotation_file_path, unique(df_rs$organism))# NON_DPPD:STARTprint("Here is the embedded runsheet")
# NON_DPPD:ENDprint("Loading Raw Data...")# NON_DPPD
[1] "Loading Raw Data..."
Code
allTrue<-function(i_vector){if(length(i_vector)==0){stop(paste("Input vector is length zero"))}all(i_vector)}# Define paths to raw data filesrunsheetPathsAreURIs<-function(df_runsheet){allTrue(stringr::str_starts(df_runsheet$`Array Data File Path`, "https"))}# Download raw data filesdownloadFilesFromRunsheet<-function(df_runsheet){urls<-df_runsheet$`Array Data File Path`destinationFiles<-df_runsheet$`Array Data File Name`mapply(function(url, destinationFile){print(paste0("Downloading from '", url, "' TO '", destinationFile, "'"))if(file.exists(destinationFile)){warning(paste("Using Existing File:", destinationFile))}else{download.file(url, destinationFile)}}, urls, destinationFiles)destinationFiles# Return these paths}if(runsheetPathsAreURIs(df_rs)){print("Determined Raw Data Locations are URIS")local_paths<-retry_with_delay(downloadFilesFromRunsheet, df_rs)}else{print("Or Determined Raw Data Locations are local paths")local_paths<-df_rs$`Array Data File Path`}
[1] "Determined Raw Data Locations are URIS"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_14R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_14R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_16R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_16R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_20R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_20R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_52R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_52R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_54R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_54R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_58R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_58R_(Mouse430_2).CEL.gz'"
Code
# uncompress files if neededif(allTrue(stringr::str_ends(local_paths, ".gz"))){print("Determined these files are gzip compressed... uncompressing now")# This does the uncompressionlapply(local_paths, R.utils::gunzip, remove =FALSE, overwrite =TRUE)# This removes the .gz extension to get the uncompressed filenameslocal_paths<-vapply(local_paths, stringr::str_replace, # Run this function against each item in 'local_paths' FUN.VALUE =character(1), # Execpt an character vector as a return USE.NAMES =FALSE, # Don't use the input to assign names for the returned list pattern =".gz$", # first argument for applied function replacement =""# second argument for applied function)}
[1] "Determined these files are gzip compressed... uncompressing now"
# NON_DPPD:END# Load raw data into R object# Retry with delay here to accomodate oligo's automatic loading of annotation packages and occasional internet related failures to loadraw_data<-retry_with_delay(oligo::read.celfiles,df_local_paths$`Local Paths`, sampleNames =df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames))
Reading in : GLDS-87_microarray_14R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_16R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_20R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_52R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_54R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_58R_(Mouse430_2).CEL
message(paste0("Number of Arrays: ", dim(raw_data)[2]))# NON_DPPDmessage(paste0("Number of Probes: ", dim(raw_data)[1]))# NON_DPPD# NON_DPPD:STARTDT::datatable(raw_data$targets, caption ="Sample to File Mapping")
Code
DT::datatable(head(raw_data$genes, n =20), caption ="First 20 rows of raw data file embedded probes to genes table")
Code
# NON_DPPD:END
3 QA For Raw Data
3.1 Density Plot
Code
# Plot settingspar( xpd =TRUE# Ensure legend can extend past plot area)number_of_sets=ceiling(dim(raw_data)[2]/30)# Set of 30 samples, used to scale plotoligo::hist(raw_data, transfo=log2, # Log2 transform raw intensity values which=c("all"), nsample=10000, # Number of probes to plot main ="Density of raw intensities for multiple arrays")legend("topright", legend =colnames(raw_data@assayData$exprs), lty =c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types col =oligo::darkColors(n =ncol(raw_data)), # Ensure legend color is in sync with plot ncol =number_of_sets, # Set number of columns by number of sets cex =max(0.35, 1+0.2-(number_of_sets*0.2))# Reduce scale by 20% for each column beyond 1 with minimum of 35%)
Density of raw intensities for each array. A lack of overlap indicates a need for normalization.
for(iinseq_along(1:ncol(raw_data))){message(glue::glue("Drawing Psuedoimage for {colnames(raw_data)[i]}"))# NON_DPPDoligo::image(raw_data[,i], transfo =log2, main =colnames(raw_data)[i])}
3.3 MA Plots
Code
if(inherits(raw_data, "GeneFeatureSet")){print("Raw data is a GeneFeatureSet, using exprs() to access expression values and adding 0.0001 to avoid log(0)")}elseif(inherits(raw_data, "ExpressionSet")){print("Raw data is an ExpressionSet. Using default approach for this class for MA Plot")}elseif(inherits(raw_data, "ExpressionFeatureSet")){print("Raw data is an ExpressionFeatureSet. Using default approach for this class for MA Plot")}
[1] "Raw data is an ExpressionFeatureSet. Using default approach for this class for MA Plot"
M = Expression log-ratio (this sample vs. pseudo median reference chip)
A = Average log-expression
Code
if(inherits(raw_data, "GeneFeatureSet")){MA_plot<-oligo::MAplot(exprs(raw_data)+0.0001, transfo=log2, ylim=c(-2, 4), main=""# This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string)}elseif(inherits(raw_data, "ExpressionSet")){MA_plot<-oligo::MAplot(raw_data, ylim=c(-2, 4), main=""# This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string)}elseif(inherits(raw_data, "ExpressionFeatureSet")){MA_plot<-oligo::MAplot(raw_data, ylim=c(-2, 4), main=""# This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string)}else{stop(glue::glue("No strategy for MA plots for {raw_data}"))}
3.4 Boxplots
Code
max_samplename_length<-max(nchar(colnames(raw_data)))dynamic_lefthand_margin<-max(max_samplename_length*0.7, 10)par( mar =c(8, dynamic_lefthand_margin, 8, 2)+0.1, # mar is the margin around the plot. c(bottom, left, top, right) xpd =TRUE)boxplot<-oligo::boxplot(raw_data[, rev(colnames(raw_data))], # Here we reverse column order to ensure descending order for samples in horizontal boxplot transfo=log2, # Log2 transform raw intensity values which=c("all"), nsample=10000, # Number of probes to plot las =1, # las specifies the orientation of the axis labels. 1 = always horizontal ylab="", xlab="log2 Intensity", main ="Boxplot of raw intensities \nfor perfect match and mismatch probes", horizontal =TRUE)title(ylab ="Sample Name", mgp =c(dynamic_lefthand_margin-2, 1, 0))
# Normalize background-corrected data using the quantile methodnorm_data<-oligo::normalize(background_corrected_data, method ="quantile", target ="core"# Use oligo default: core metaprobeset mappings)# Summarize background-corrected and normalized dataprint("Summarized Normalized Data Below")# NON_DPPD
message(paste0("Number of Arrays: ", dim(norm_data)[2]))# NON_DPPDmessage(paste0("Number of Probes: ", dim(norm_data)[1]))# NON_DPPD# NON_DPPD:STARTDT::datatable(raw_data$targets, caption ="Sample to File Mapping")
Code
DT::datatable(head(raw_data$genes, n =20), caption ="First 20 rows of raw data file embedded probes to genes table")
Code
# NON_DPPD:END
6 QA For Normalized Data
6.1 Density Plot
Code
# Plot settingspar( xpd =TRUE# Ensure legend can extend past plot area)number_of_sets=ceiling(dim(norm_data)[2]/30)# Set of 30 samples, used to scale plotoligo::hist(norm_data, transfo=log2, # Log2 transform normalized intensity values which=c("all"), nsample=10000, # Number of probes to plot main ="Density of normalized intensities for multiple arrays")legend("topright", legend =colnames(norm_data@assayData$exprs), lty =c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types col =oligo::darkColors(n =ncol(norm_data)), # Ensure legend color is in sync with plot ncol =number_of_sets, # Set number of columns by number of sets cex =max(0.35, 1+0.2-(number_of_sets*0.2))# Reduce scale by 20% for each column beyond 1)
Density of normalized intensities for each array. Compared to the raw data density plot, array densities should overlap more.
for(iinseq_along(1:ncol(norm_data))){message(glue::glue("Drawing Psuedoimage for {colnames(norm_data)[i]}"))# NON_DPPDoligo::image(norm_data[,i], transfo =log2, main =colnames(norm_data)[i])}